{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<center>\n",
    "<img src=\"../../img/ods_stickers.jpg\">\n",
    "## Открытый курс по машинному обучению\n",
    "</center>\n",
    "Автор материала: аналитик-разработчик в команде Яндекс.Метрики Мария Мансурова. Материал распространяется на условиях лицензии [Creative Commons CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Можно использовать в любых целях (редактировать, поправлять и брать за основу), кроме коммерческих, но с обязательным упоминанием автора материала.\n",
    "\n",
    "#### <center>К [статье](https://habrahabr.ru/company/ods/blog/323730/) на Хабре \"Предсказываем будущее с помощью библиотеки Facebook Prophet\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "from plotly import __version__\n",
    "\n",
    "print(__version__)  # need 1.9.0 or greater\n",
    "import pandas as pd\n",
    "import requests\n",
    "from plotly import graph_objs as go\n",
    "from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot\n",
    "\n",
    "init_notebook_mode(connected=True)\n",
    "\n",
    "\n",
    "def plotly_df(df, title=\"\"):\n",
    "    data = []\n",
    "\n",
    "    for column in df.columns:\n",
    "        trace = go.Scatter(x=df.index, y=df[column], mode=\"lines\", name=column)\n",
    "        data.append(trace)\n",
    "\n",
    "    layout = dict(title=title)\n",
    "    fig = dict(data=data, layout=layout)\n",
    "    iplot(fig, show_link=False)\n",
    "\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import statsmodels.api as sm\n",
    "from scipy import stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Загрузка и предобработка данных\n",
    "Данные [соревнования](https://inclass.kaggle.com/c/howpop-habrahabr-favs-lognorm) по прогнозу популярности статьи на Хабрахабре."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "habr_df = pd.read_csv(\"../../data/howpop_train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "habr_df[\"published\"] = pd.to_datetime(habr_df.published)\n",
    "habr_df = habr_df[[\"published\", \"url\"]]\n",
    "habr_df = habr_df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "aggr_habr_df = habr_df.groupby(\"published\")[[\"url\"]].count()\n",
    "aggr_habr_df.columns = [\"posts\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "aggr_habr_df = aggr_habr_df.resample(\"D\").apply(sum)\n",
    "plotly_df(\n",
    "    aggr_habr_df.resample(\"W\").apply(sum), title=\"Опубликованные посты на Хабрахабре\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Построение прогноза Prophet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pip install pystan\n",
    "# pip install fbprophet\n",
    "from fbprophet import Prophet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = 30\n",
    "\n",
    "df = aggr_habr_df.reset_index()\n",
    "df.columns = [\"ds\", \"y\"]\n",
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = df[:-predictions]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = Prophet()\n",
    "m.fit(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "future = m.make_future_dataframe(periods=30)\n",
    "future.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast = m.predict(future)\n",
    "forecast.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\", \".join(forecast.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.plot(forecast)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.plot_components(forecast)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Оценка качества Prophet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cmp_df = forecast.set_index(\"ds\")[[\"yhat\", \"yhat_lower\", \"yhat_upper\"]].join(\n",
    "    df.set_index(\"ds\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "cmp_df[\"e\"] = cmp_df[\"y\"] - cmp_df[\"yhat\"]\n",
    "cmp_df[\"p\"] = 100 * cmp_df[\"e\"] / cmp_df[\"y\"]\n",
    "np.mean(abs(cmp_df[-predictions:][\"p\"])), np.mean(abs(cmp_df[-predictions:][\"e\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Прогноз с BoxCox"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def invboxcox(y, lmbda):\n",
    "    if lmbda == 0:\n",
    "        return np.exp(y)\n",
    "    else:\n",
    "        return np.exp(np.log(lmbda * y + 1) / lmbda)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df2 = train_df.copy().fillna(14)\n",
    "train_df2 = train_df2.set_index(\"ds\")\n",
    "train_df2[\"y\"], lmbda_prophet = stats.boxcox(train_df2[\"y\"])\n",
    "\n",
    "train_df2.reset_index(inplace=True)\n",
    "\n",
    "m2 = Prophet()\n",
    "m2.fit(train_df2)\n",
    "future2 = m2.make_future_dataframe(periods=30)\n",
    "\n",
    "forecast2 = m2.predict(future2)\n",
    "forecast2[\"yhat\"] = invboxcox(forecast2.yhat, lmbda_prophet)\n",
    "forecast2[\"yhat_lower\"] = invboxcox(forecast2.yhat_lower, lmbda_prophet)\n",
    "forecast2[\"yhat_upper\"] = invboxcox(forecast2.yhat_upper, lmbda_prophet)\n",
    "\n",
    "cmp_df2 = forecast2.set_index(\"ds\")[[\"yhat\", \"yhat_lower\", \"yhat_upper\"]].join(\n",
    "    df.set_index(\"ds\")\n",
    ")\n",
    "\n",
    "cmp_df2[\"e\"] = cmp_df2[\"y\"] - cmp_df2[\"yhat\"]\n",
    "cmp_df2[\"p\"] = 100 * cmp_df2[\"e\"] / cmp_df2[\"y\"]\n",
    "np.mean(abs(cmp_df2[-predictions:][\"p\"])), np.mean(abs(cmp_df2[-predictions:][\"e\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Визуализация результатов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_forecast(cmp_df, num_predictions, num_values):\n",
    "    upper_bound = go.Scatter(\n",
    "        name=\"Upper Bound\",\n",
    "        x=cmp_df.tail(num_predictions).index,\n",
    "        y=cmp_df.tail(num_predictions).yhat_upper,\n",
    "        mode=\"lines\",\n",
    "        marker=dict(color=\"444\"),\n",
    "        line=dict(width=0),\n",
    "        fillcolor=\"rgba(68, 68, 68, 0.3)\",\n",
    "        fill=\"tonexty\",\n",
    "    )\n",
    "\n",
    "    forecast = go.Scatter(\n",
    "        name=\"Prediction\",\n",
    "        x=cmp_df.tail(predictions).index,\n",
    "        y=cmp_df.tail(predictions).yhat,\n",
    "        mode=\"lines\",\n",
    "        line=dict(color=\"rgb(31, 119, 180)\"),\n",
    "    )\n",
    "\n",
    "    lower_bound = go.Scatter(\n",
    "        name=\"Lower Bound\",\n",
    "        x=cmp_df.tail(num_predictions).index,\n",
    "        y=cmp_df.tail(num_predictions).yhat_lower,\n",
    "        marker=dict(color=\"444\"),\n",
    "        line=dict(width=0),\n",
    "        mode=\"lines\",\n",
    "    )\n",
    "\n",
    "    fact = go.Scatter(\n",
    "        name=\"Fact\",\n",
    "        x=cmp_df.tail(num_values).index,\n",
    "        y=cmp_df.tail(num_values).y,\n",
    "        marker=dict(color=\"red\"),\n",
    "        mode=\"lines\",\n",
    "    )\n",
    "\n",
    "    # Trace order can be important\n",
    "    # with continuous error bars\n",
    "    data = [lower_bound, upper_bound, forecast, fact]\n",
    "\n",
    "    layout = go.Layout(\n",
    "        yaxis=dict(title=\"Посты\"),\n",
    "        title=\"Опубликованные посты на Хабрахабре\",\n",
    "        showlegend=False,\n",
    "    )\n",
    "\n",
    "    fig = go.Figure(data=data, layout=layout)\n",
    "    iplot(fig, show_link=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "show_forecast(cmp_df, predictions, 200)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "##  Сравнение с ARIMA моделью"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = train_df.fillna(14).set_index(\"ds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15, 10))\n",
    "sm.tsa.seasonal_decompose(train_df[\"y\"].values, freq=7).plot()\n",
    "print(\"Критерий Дики-Фуллера: p=%f\" % sm.tsa.stattools.adfuller(train_df[\"y\"])[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.index = pd.to_datetime(train_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df[\"y_box\"], lmbda = stats.boxcox([1 if x == 0 else x for x in train_df[\"y\"]])\n",
    "plt.figure(figsize=(15, 7))\n",
    "train_df.y.plot()\n",
    "plt.ylabel(u\"Posts on Habr\")\n",
    "print(\"Оптимальный параметр преобразования Бокса-Кокса: %f\" % lmbda)\n",
    "print(\"Критерий Дики-Фуллера: p=%f\" % sm.tsa.stattools.adfuller(train_df[\"y\"])[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df[\"y_box_diff\"] = train_df.y_box - train_df.y_box.shift(7)\n",
    "plt.figure(figsize=(15, 10))\n",
    "sm.tsa.seasonal_decompose(train_df.y_box_diff[12:].values, freq=7).plot()\n",
    "print(\n",
    "    \"Критерий Дики-Фуллера: p=%f\"\n",
    "    % sm.tsa.stattools.adfuller(train_df.y_box_diff[8:])[1]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15, 8))\n",
    "ax = plt.subplot(211)\n",
    "sm.graphics.tsa.plot_acf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)\n",
    "ax = plt.subplot(212)\n",
    "sm.graphics.tsa.plot_pacf(train_df.y_box_diff[13:].values.squeeze(), lags=48, ax=ax)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Начальные приближения Q = 1, q = 4, P = 5, p = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ps = range(0, 4)\n",
    "d = 1\n",
    "qs = range(0, 5)\n",
    "Ps = range(0, 7)\n",
    "D = 1\n",
    "Qs = range(0, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from itertools import product\n",
    "\n",
    "parameters = product(ps, qs, Ps, Qs)\n",
    "parameters_list = list(parameters)\n",
    "len(parameters_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "results = []\n",
    "best_aic = float(\"inf\")\n",
    "\n",
    "\n",
    "for param in parameters_list:\n",
    "    print(param)\n",
    "    # try except нужен, потому что на некоторых наборах параметров модель не обучается\n",
    "    try:\n",
    "        %time model = sm.tsa.statespace.SARIMAX(\\\n",
    "                  train_df.y_box,\\\n",
    "                  order=(param[0], d, param[1]),\\\n",
    "                  seasonal_order=(param[2], D, param[3], 7),\\\n",
    "              ).fit(\\\n",
    "                  disp=-1\\\n",
    "              )\n",
    "    # выводим параметры, на которых модель не обучается и переходим к следующему набору\n",
    "    except ValueError:\n",
    "        print(\"wrong parameters:\", param)\n",
    "        continue\n",
    "    aic = model.aic\n",
    "    # сохраняем лучшую модель, aic, параметры\n",
    "    if aic < best_aic:\n",
    "        best_model = model\n",
    "        best_aic = aic\n",
    "        best_param = param\n",
    "    results.append([param, model.aic])\n",
    "\n",
    "warnings.filterwarnings(\"default\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_table = pd.DataFrame(results)\n",
    "result_table.columns = [\"parameters\", \"aic\"]\n",
    "print(result_table.sort_values(by=\"aic\", ascending=True).head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(best_model.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(15, 8))\n",
    "plt.subplot(211)\n",
    "best_model.resid[13:].plot()\n",
    "plt.ylabel(u\"Residuals\")\n",
    "\n",
    "ax = plt.subplot(212)\n",
    "sm.graphics.tsa.plot_acf(best_model.resid[13:].values.squeeze(), lags=48, ax=ax)\n",
    "\n",
    "print(\"Критерий Стьюдента: p=%f\" % stats.ttest_1samp(best_model.resid[13:], 0)[1])\n",
    "print(\n",
    "    \"Критерий Дики-Фуллера: p=%f\" % sm.tsa.stattools.adfuller(best_model.resid[13:])[1]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df[\"arima_model\"] = invboxcox(best_model.fittedvalues, lmbda)\n",
    "plt.figure(figsize=(15, 7))\n",
    "train_df.y.tail(200).plot()\n",
    "train_df.arima_model[13:].tail(200).plot(color=\"r\")\n",
    "plt.ylabel(\"Posts on Habr\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "arima_df = train_df2.set_index(\"ds\")[[\"y\"]]\n",
    "\n",
    "date_list = [\n",
    "    pd.datetime.strptime(\"2016-10-01\", \"%Y-%m-%d\") + pd.Timedelta(x)\n",
    "    for x in range(0, predictions + 1)\n",
    "]\n",
    "future = pd.DataFrame(index=date_list, columns=arima_df.columns)\n",
    "arima_df = pd.concat([arima_df, future])\n",
    "arima_df[\"forecast\"] = invboxcox(\n",
    "    best_model.predict(\n",
    "        start=train_df.shape[0], end=train_df.shape[0] + predictions - 1\n",
    "    ),\n",
    "    lmbda,\n",
    ")\n",
    "plt.figure(figsize=(15, 7))\n",
    "arima_df.y.tail(200).plot()\n",
    "arima_df.forecast.tail(200).plot(color=\"r\")\n",
    "plt.ylabel(\"Habr posts\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cmp_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cmp_df = cmp_df.join(arima_df[[\"forecast\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "cmp_df[\"e_arima\"] = cmp_df[\"y\"] - cmp_df[\"forecast\"]\n",
    "cmp_df[\"p_arima\"] = 100 * cmp_df[\"e_arima\"] / cmp_df[\"y\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_values = 200\n",
    "\n",
    "forecast = go.Scatter(\n",
    "    name=\"Prophet\",\n",
    "    x=cmp_df.tail(predictions).index,\n",
    "    y=cmp_df.tail(predictions).yhat,\n",
    "    mode=\"lines\",\n",
    "    line=dict(color=\"rgb(31, 119, 180)\"),\n",
    ")\n",
    "\n",
    "\n",
    "fact = go.Scatter(\n",
    "    name=\"Fact\",\n",
    "    x=cmp_df.tail(num_values).index,\n",
    "    y=cmp_df.tail(num_values).y,\n",
    "    marker=dict(color=\"red\"),\n",
    "    mode=\"lines\",\n",
    ")\n",
    "\n",
    "arima = go.Scatter(\n",
    "    name=\"ARIMA\",\n",
    "    x=cmp_df.tail(predictions).index,\n",
    "    y=cmp_df.tail(predictions).forecast,\n",
    "    mode=\"lines\",\n",
    ")\n",
    "\n",
    "# Trace order can be important\n",
    "# with continuous error bars\n",
    "data = [forecast, fact, arima]\n",
    "\n",
    "layout = go.Layout(\n",
    "    yaxis=dict(title=\"Посты\"),\n",
    "    title=\"Опубликованные посты на Хабрахабре\",\n",
    "    showlegend=True,\n",
    ")\n",
    "\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "iplot(fig, show_link=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
